/*
1) Chronic vs. acute classifications
2) Plot line extension timing for chronic markets

*/


/*
Step 1: import SSR Health raw data and mark when line extensions of each drug launch
*/


import excel "US brand Rx net price tool _1Q19_dataonly.xlsx", ///
	sheet("Product data") cellrange(B7:RI1092) firstrow clear // skips some header rows
	
drop if Product == ""	// empty lines

foreach var of varlist _all {
	capture confirm string var `var'
	if _rc != 0 {
		qui sum `var'
		if r(max) == . | (r(max) == 0 & r(min) == 0) {
			disp "`var'"
			drop `var'
			}
		}
	}

* Capitalize string variables
replace Product = trim(upper(Product))
replace Molecule = trim(upper(Molecule))
replace Company = trim(upper(Company))
replace Class = trim(upper(Class))

gen biologic = Size == "Large"
drop Size

rename Marketstart mkt_start
rename Marketend mkt_end
rename Exclude exclude
	
* Reshape to get a panel dataset
reshape long net_sales WAC_sales units WAC_price net_price tot_discount ///
			 medicaid_discount non_medicaid_discount, ///
	i(Company Product Molecule biologic Class ///
	  mkt_start mkt_end LOE exclude) j(qrt) string

* create numeric quarter variable
gen quarter = quarterly(qrt,"Q20Y")
format quarter %tq
drop qrt

* replace sales and units with missing values whenever it is set to 0
replace net_sales = . if net_sales == 0
replace WAC_sales = . if WAC_sales == 0
replace units = . if units == 0

* Drop vaccines
bysort Product : egen vaccine = max(regexm(Molecule, "VACC"))
drop if vaccine == 1
drop vaccine

* Drop variables we will recalculate
// drop WAC_price

compress
save "main_SSR_1Q19.dta", replace




import excel "Strength Form Sales Data_dataonly.xlsx", ///
	sheet("Symphony_") firstrow clear
drop if Keyproduct == ""

rename Keyproduct Product
rename Keystrengthform StrengthForm

drop if inlist(Product, "Corlanor", "Imlygic", "Soliris")	// these don't appear in main dataset

* Fix missing Molecule values
replace Molecule = "Insulin Regular, Human" if Product == "Afrezza"
replace Molecule = "Factor Ix Rec, Fc Fusion Protn" if Product == "Alprolix"
// * There are some more products with missing molecule, but the molecule is 
// * missing across all datasets so we don't impute for now.
// replace Molecule = "Florbetapir" if Product == "Amyvid"
// replace Molecule = "Sufentanil" if Product == "Dsuvia"
// replace Molecule = "Nitric oxide" if Product == "Inomax"
// replace Molecule = "Axicabtagene ciloleucel" if Product == "Yescarta"

* reshape
reshape long WAC_sales units, ///
	i(Product StrengthForm Molecule) j(qrt) string

gen quarter = quarterly(qrt,"Q20Y")
format quarter %tq
drop qrt

drop if WAC_sales == 0
replace Product = upper(trim(Product))
replace Molecule = upper(trim(Molecule))

* Save as a temporary file (more edits need to be made still)
tempfile PSF_temp
save "`PSF_temp'", replace

* Now grab only the product characteristics
keep Product StrengthForm Molecule
duplicates drop
gen data = "PSF"
tempfile PSF_info
save "`PSF_info'", replace

*** END OF PART 1


/*	3. Import mapping of drug name to NDC codes	*/
import excel "US brand Rx net price tool _1Q19_dataonly.xlsx", ///
	sheet("Unit of Measure data") firstrow clear

* drop empty rows
drop if KeyProduct == ""

* Basic maintenance
drop A 			// empty
drop Product 	// the Product variable is not the same as in the other dataset.

rename KeyProduct Product
rename KeyStrengthForm StrengthForm
rename GenericProductName Molecule

replace Product = trim(upper(Product))
replace Molecule = upper(trim(Molecule))


* save as a temporary file, as we'll need to go back to make edits.
tempfile NDC_mapping_temp
save "`NDC_mapping_temp'", replace

* Now create a map of StrengthForm to Strength + Form
keep Product Molecule StrengthForm Strength Form
duplicates drop

gen data = "characteristics"

* append to the PSF info data
append using "`PSF_info'"

* Harmonize the Molecule variable first
bysort Product StrengthForm Molecule : gen tot = _N

bysort Product StrengthForm : gen Molecule2 = Molecule if tot == 2
bysort Product StrengthForm (Molecule2) : replace Molecule2 = Molecule2[_N]

* Now harmonize the Strength and Form and StrengthForm variables (notice that
* we do not expect to fill everything since some values are missing for some
* Products).
bysort Product Molecule2 StrengthForm (Strength) : gen Strength2 = Strength[_N]
bysort Product Molecule2 StrengthForm (Form) : gen Form2 = Form[_N]

bysort Product Molecule2 Strength2 Form2 (StrengthForm) : ///
	gen StrengthForm2 = StrengthForm[_N]

* Now we split the data back into the two components (PSF and characteristics)
* and merge in to the full datasets the new variables, which will became the new
* values after the merge
tempfile walkthrough
save "`walkthrough'", replace


* Merge first with NDC_mapping
keep if data == "characteristics"
merge 1:m Product Molecule StrengthForm Strength Form ///
	using "`NDC_mapping_temp'", nogen	// should be a perfect match

* replace the original variables
foreach var in Molecule StrengthForm Strength Form {
	drop `var'
	rename `var'2 `var'
	}

* Drop vaccines
bysort Product : egen vaccine = max(regexm(Molecule, "VACC"))
drop if vaccine == 1
drop vaccine

// these are vaccines but are missed by the algorithm
drop if inlist(Product, "RABAVERT", "ZOSTAVAX")

* save
save "ndc_product_mapping.dta", replace



use "ndc_product_mapping.dta", clear	// map of Product characteristics to NDC

* create an indicator variable that groups NDC codes with the same StrengthForm
bysort Product Strength Form : gen ind = _n == 1
bysort Product (Strength Form) : gen line_num = sum(ind)
keep Product NDC11 Strength Form line_num

* merge with Medicaid Utilization data
merge 1:m NDC11 using "quarterly_ssr_drug_medicaid_by_NDC.dta"

* calculate earliest quarter for which we see Medicaid prescriptions
bysort Product line_num : egen first_qrt = min(quarter)
format first_qrt %tq

keep Product Strength Form line_num first_qrt
duplicates drop

drop if first_qrt == .	// these drugs never appear in the Medicaid data

* Create indicators for whether a new line has a new strength or a new form
bysort Product Form (first_qrt) : gen new_form = _n == 1
bysort Product Strength (first_qrt) : gen new_str = _n == 1

* save temporary dataset identifying Form and Strength associated with line 
* extensions
save "new_products_form_strength.dta", replace


* collapse to get a Product-quarter dataset
gen num_new_lines = 1
collapse (sum) num_new_lines ///
		 (max) new_form new_str, by(Product first_qrt)
rename first_qrt quarter

* now, for each product, drop the first entry (which is presumably the 
* originator product, and not a line extension) and all entries within a year of 
* that first entry.
bysort Product : egen first_qrt = min(quarter)
drop if quarter < first_qrt + 4
drop first_qrt

* now drop line extensions that were introduced prior to 2007q1
drop if quarter < tq(2007q1)

* save
save "line_extensions.dta", replace

/*
Step 2: classify SSR Health drugs into chronic and acute 
*/

clear
gen chronic = 1
save meps_ndc_class_mapping, replace

forvalues y=2002/2016 {

	clear
	use P`y'
	
	keep RXNDC RXNAME TC1 TC1S1
	drop if RXNDC == ""
	
	gen scripts = 1
	collapse (sum) scripts (first) RXNAME, by(RXNDC TC1 TC1S1)
	
	append using meps_ndc_class_mapping
	save meps_ndc_class_mapping, replace
	
}


drop if RXNDC == "-9" | RXNDC == "-7" | RXNDC == "-8"

collapse (sum) scripts (first) RXNAME, by(RXNDC TC1 TC1S1)

tab TC1

save meps_ndc_class_mapping, replace






clear
use meps_ndc_class_mapping

collapse (sum) scripts, by(TC1 TC1S1)

gsort - scripts 

save MEPS_TC1S1_classes, replace // use this to manually assign chronic/acute



clear
use meps_ndc_class_mapping

merge m:1 TC1 using TC1_chronic, keep(1 3)
drop _merge 
rename chronic chronic_tc1
merge m:1 TC1S1 using TC1S1_chronic, keep(1 3)
drop _merge
rename chronic chronic_tc1s1

drop if chronic_tc1 == . & chronic_tc1s1 == .

gen chronic = chronic_tc1
replace chronic = chronic_tc1s1 if chronic == .


gsort chronic - scripts
save meps_ndc_chronic_mapping, replace



// Final Step: use NDC to bridge to Product name
clear
use meps_ndc_chronic_mapping

// generally consistent within group, a few disagreements
gsort RXNDC - scripts
egen t = tag(RXNDC)
keep if t == 1
drop t

rename RXNDC NDC11

merge 1:1 NDC11 using "ndc_product_mapping.dta" 
keep if _merge == 3
drop _merge

collapse (mean) chronic, by(Product)

replace chronic = round(chronic)

save ssr_product_chronic_classification, replace





/*
Step 3: final analysis
*/


use "main_SSR_1Q19.dta", clear 


// NEW: try merging on additional information
merge m:1 Product using "ssr_product_chronic_classification"
keep if _merge == 3
drop _merge

merge m:1 Product quarter using "line_extensions"
keep if _merge == 1 | _merge == 3
drop _merge


keep if chronic != .
replace chronic = 1 if Class == "HIV"


foreach v in num_new_lines new_form new_str {
    
	replace `v' = 0 if `v' == .
}

gen le_ind = num_new_lines > 0	// we also use new_form and new_str
gen rel_loe = (quarter - qofd(LOE))

//collapse (mean) le_ind, by(rel_loe chronic)
keep if rel_loe >= -30 & rel_loe != .
keep if le_ind == 1

tab chronic


reg rel_loe chronic, r
reg rel_loe chronic if rel_loe >= -12, r


tab rel_loe if chronic == 0


kdensity rel_loe if chronic == 1, xtitle("Quarters Before Generic Entry") ytitle("Density") title("")
graph save chronic_le_v2, replace
graph export chronic_le_v2.pdf, replace

